# NB: IRB requirements mean we cannot share the raw tweets.  We provide this script to show the steps we took.

import pandas as pd
from ast import literal_eval
import reverse_geocoder as rg
import os
import numpy as np
from codecs import encode


# Score cutoff used to label an image as containing protest or not.  The script
# below keeps only rows whose 'protest_result.protest' value is greater than this.
threshold = .826


def literal_converter(val):
    """Parse a stringified Python literal after collapsing doubled brackets.

    Every ``[[`` in ``val`` is replaced with ``[`` and every ``]]`` with ``]``
    before the text is handed to ``ast.literal_eval``.

    Parameters
    ----------
    val : object
        Usually a string holding a literal such as ``"[[1, 2]]"``.

    Returns
    -------
    object
        The parsed literal, or ``str(val)`` when ``val`` is not a string or
        does not contain a valid literal.
    """
    try:
        return literal_eval(val.replace("[[", '[').replace(']]', ']'))
    except (AttributeError, ValueError, TypeError, SyntaxError,
            MemoryError, RecursionError):
        # AttributeError: val has no .replace (e.g. NaN); the rest are the
        # errors literal_eval raises on malformed input.  Anything else
        # (KeyboardInterrupt, SystemExit, ...) is allowed to propagate.
        return str(val)



def read_data2(tweet_path):
    """Read a tweet CSV and parse its bounding-box column into Python lists.

    Parameters
    ----------
    tweet_path : str or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents.  When a 'place.bounding_box.coordinates' column is
        present, each cell has its doubled brackets (``[[`` / ``]]``) collapsed
        and is parsed with ``ast.literal_eval``; cells that cannot be parsed
        (for example missing values) are left exactly as read.
    """
    print('==> reading {}'.format(tweet_path))
    data = pd.read_csv(tweet_path)

    column = 'place.bounding_box.coordinates'
    if column not in data.columns:
        return data

    # AttributeError covers non-string cells (e.g. NaN has no .replace); the
    # others are what literal_eval raises on malformed text.  Interrupts such
    # as KeyboardInterrupt are deliberately not caught.
    parse_errors = (AttributeError, ValueError, TypeError, SyntaxError,
                    MemoryError, RecursionError)

    def _parse(cell):
        try:
            return literal_eval(cell.replace("[[", '[').replace(']]', ']'))
        except parse_errors:
            # Keep the raw cell so missing values stay missing.
            return cell

    data[column] = [_parse(cell) for cell in data[column]]
    return data


def _decode_escaped_name(name):
    """Best-effort conversion of backslash-escaped bytes in ``name`` to UTF-8 text.

    Returns ``name`` unchanged when the escape/decode round trip fails.
    """
    try:
        raw = encode(name.encode().decode('unicode_escape'), "raw_unicode_escape")
        return raw.decode('utf-8')
    except ValueError:  # UnicodeError (encode/decode failures) is a ValueError
        return name


def get_city_county(data):
    """Correct tweet locations using reverse_geocoder.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'place.bounding_box.coordinates' (parsed
        lists of [long, lat] points), 'place.place_type' and 'place.name'.

    Returns
    -------
    pandas.DataFrame
        A new frame without the rows whose bounding box is missing, with these
        columns added: 'mid_long', 'mid_lat', 'bounding_box.center_rg',
        'rg.reversegeocode_results', 'rg.city', 'rg.state', 'rg.county',
        'rg.cc' and 'city_use'.  The input frame is not modified.
    """
    print('====> correcting location')
    # Some rows are missing values, need to get rid of those.  Copy so the
    # column assignments below act on an independent frame and do not trigger
    # pandas' SettingWithCopyWarning.
    data = data.dropna(subset=['place.bounding_box.coordinates']).copy()

    # REVERSE GEOCODE
    # Use the midpoint of the bounding box to aggregate by coordinates.
    # Twitter adds a polygon when it does not use a specific point.
    boxes = data['place.bounding_box.coordinates']
    data['mid_long'] = [(box[0][0] + box[2][0]) / 2.0 for box in boxes]
    data['mid_lat'] = [(box[0][1] + box[1][1]) / 2.0 for box in boxes]

    # Tuples are ordered (mid_lat, mid_long) for the reverse_geocoder query.
    data['bounding_box.center_rg'] = list(zip(data['mid_lat'], data['mid_long']))

    data['rg.reversegeocode_results'] = rg.search(data['bounding_box.center_rg'].tolist(), mode=1)

    results = data['rg.reversegeocode_results']
    data['rg.city'] = [hit['name'] for hit in results]
    data['rg.state'] = [hit['admin1'] for hit in results]
    data['rg.county'] = [hit['admin2'] for hit in results]
    data['rg.cc'] = [hit['cc'] for hit in results]

    # MAKE NEW VARIABLE THAT USES TWITTER OR REVERSE_GEOCODER DEPENDING ON PLACE_TYPE
    data['city_use'] = None

    # Where place_type is city, make city_use column the Twitter place name
    data.loc[data['place.place_type'] == "b'city'", 'city_use'] = data['place.name']
    # Otherwise - so where place_type is admin, poi, or neighborhood - use the
    # reverse_geocoder city for city_use
    data.loc[data['place.place_type'] != "b'city'", 'city_use'] = data['rg.city']

    # Below makes items human readable: strip the b'...' wrapper, then decode
    # any escaped bytes.
    # NOTE(review): this removes every "b'" and every apostrophe, including
    # ones that belong to the name itself; a non-string city_use value (e.g. a
    # missing place.name) raises AttributeError here - confirm this is intended.
    stripped = [name.replace("b'", '').replace("'", '') for name in data['city_use']]
    data['city_use'] = [_decode_escaped_name(name) for name in stripped]

    return data

# Below is for Alex's adding of metadata to events Donghyeon extracted.
tweet_dir = './Data/01_rawData/'  # Directory with Alex's raw data
th = threshold
# Keep only the 'tweet_metadata' files so already-processed output is not run
# again (this also leaves out '.DS_Store').  Skip 'PH_' files: Alex named the
# Philippines data, which Donghyeon did not pull, with the same ending he used
# for the events that Donghyeon did pull, and it will have different metadata.
files = [
    name
    for name in os.listdir(tweet_dir)
    if 'tweet_metadata' in name and 'PH_' not in name
]

# Below is getting files Donghyeon made, will load so can merge with Alex.
tweet_dir2 = './Data/01_rawData/tweets_output_new/'
# Keep only the CSV files and skip the Women's March ('WM_') events.  The
# '.csv' filter already leaves out '.DS_Store', so no explicit removal is
# needed (list.remove raises ValueError when the entry is absent).
files2 = [
    name
    for name in os.listdir(tweet_dir2)
    if '.csv' in name and 'WM_' not in name
]


# Build dictionary matching the files to each other.
# Keys are paths to Donghyeon's event files (files2); values are paths to the
# first of Alex's files whose name contains the event name.  Events without a
# match are left out.
# NOTE(review): the match is a substring test, so a short event name such as
# 'EG' would also match a file for 'EG_2016' - confirm the file naming rules
# this out.
toLoad = {}
for item in files2:
    event = item[:-4]  # Drop the last four characters (the '.csv' ending)
    matches = [candidate for candidate in files if event in candidate]
    if matches:
        toLoad[os.path.join(tweet_dir2, item)] = os.path.join(tweet_dir, matches[0])

# Events listed here are removed from toLoad so they are not processed below.
completed = ['BY', 'EG_2016', 'GA', 'HK_2014', 'KR', 'PK', 'UA', 'RU', 'ES', 'VE_2016', 'EG']

# Iterate over a snapshot of the keys because entries are deleted in the loop.
for item in list(toLoad):
    # File name is the last path component; drop its last four characters.
    event = item.rsplit('/', 1)[-1][:-4]
    if event in completed:
        del toLoad[item]


# Do the data work: for each matched pair, filter Donghyeon's rows by the
# protest threshold, attach Alex's metadata, correct locations, and save.
for item, alex_path in toLoad.items():
    # File name is the last path component; drop its last four characters.
    event = item.rsplit('/', 1)[-1][:-4]
    old = read_data2(item)
    new = read_data2(alex_path)

    # Subset Donghyeon's for proper threshold.  The original note says this is
    # redundant because the data arrived already subsetted; it is kept so the
    # cutoff can be changed once Alex has added his scene detection.
    # NOTE(review): that note quotes .827 as the pre-applied cutoff - confirm
    # against the threshold used here.
    keep = old['protest_result.protest'] > th
    old = old[keep]

    # Merge Alex's with Donghyeon, keeping every row of Donghyeon's.
    df = old.merge(new, on=['id'], how='left')

    # 'city_use' came in from Donghyeon's code; drop it since it is added anew.
    df = df.drop(columns=['city_use'])

    df = get_city_county(df)
    df.index = range(len(df))

    out_path = '{}/02_DonghyeonAlexMerged_{}_Threshold{}.csv'.format(
        tweet_dir, event, int(th * 1000))
    df.to_csv(out_path, encoding='utf-8', index=False)




